Use install.packages() to install everything you need to "knit" the code in "for_pirates.txt" (below) as an .Rmd file. Submit both the .Rmd AND the "knitted" .html file.
Each group member must be able to submit this assignment as created from their own computer. If only some members of the group submit the required files, those group members must additionally provide a supplemental explanation along with their submission as to why other students in their group have not completed this assignment.
# https://www.ted.com/talks/hans_rosling_the_best_stats_you_ve_ever_seen
# https://www.datanovia.com/en/blog/gganimate-how-to-create-plots-with-beautiful-animation-in-r/
library('ggplot2')
library('gganimate')
# Use the black-and-white ggplot theme for every plot that follows
theme_set(theme_bw())
library('gapminder')
# gifski renders gganimate frames into an animated GIF
library('gifski')
# Gapminder bubble chart: GDP per capita (log x axis) vs life expectancy,
# bubble size = population, one color per country (legend suppressed).
base_plot <- ggplot(
  gapminder,
  aes(x=gdpPercap, y=lifeExp, size=pop, colour=country)
) +
  geom_point(show.legend=FALSE, alpha=0.7) +
  scale_color_viridis_d() +
  scale_size(range = c(2, 12)) +
  scale_x_log10() +
  labs(x = "GDP per capita", y = "Life expectancy")
# One panel per continent; gganimate advances frames through the `year`
# column, holding the y axis fixed while the view follows the data.
# {frame_time} in the title is interpolated by gganimate on each frame.
base_plot +
  facet_wrap(~continent) +
  transition_time(year) +
  view_follow(fixed_y = TRUE) +
  labs(title = "Year: {frame_time}")
Use conda create --name <name_of_your_environment> (or another means) to create an environment, conda activate <name_of_your_environment> (or another means) to activate it, and conda install <package_name> (or another means) to install Jupyter Notebook and whatever other libraries you require in order to run the code in "for_snakes.txt" in a Jupyter notebook. Submit both the .ipynb "jupyter notebook" file AND an .html export of the notebook.
Each group member must be able to submit this assignment as created from their own computer. If only some members of the group submit the required files, those group members must additionally provide a supplemental explanation along with their submission as to why other students in their group have not completed this assignment.
# https://matplotlib.org/3.1.0/gallery/animation/unchained.html
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import HTML
from matplotlib import animation, rc
# Render animations as HTML5 <video> elements when displayed in the notebook
rc('animation', html='html5')
# IPython magic: interactive inline backend (notebook-only, not plain Python)
%matplotlib notebook
# Fixing random state for reproducibility
np.random.seed(19680801)
# Create new Figure with black background
fig = plt.figure(figsize=(8, 8), facecolor='black')
# Add a subplot with no frame
ax = plt.subplot(111, frameon=False)
# Generate random data: 64 traces, 75 samples each, uniform on [0, 1)
data = np.random.uniform(0, 1, (64, 75))
X = np.linspace(-1, 1, data.shape[-1])
# Gaussian envelope that damps each curve toward the left/right plot edges
G = 1.5 * np.exp(-4 * X ** 2)
# Generate line plots
lines = []
for i in range(len(data)):
    # Small reduction of the X extents to get a cheap perspective effect
    xscale = 1 - i / 200.
    # Same for linewidth (thicker strokes on bottom)
    lw = 1.5 - i / 100.0
    # Each trace is vertically offset by its index i
    line, = ax.plot(xscale * X, i + G * data[i], color="w", lw=lw)
    lines.append(line)
# Set y limit (or first line is cropped because of thickness)
ax.set_ylim(-1, 70)
# No ticks
ax.set_xticks([])
ax.set_yticks([])
# 2 part titles to get different font weights
ax.text(0.5, 1.0, "MATPLOTLIB ", transform=ax.transAxes,
        ha="right", va="bottom", color="w",
        family="sans-serif", fontweight="light", fontsize=16)
ax.text(0.5, 1.0, "UNCHAINED", transform=ax.transAxes,
        ha="left", va="bottom", color="w",
        family="sans-serif", fontweight="bold", fontsize=16)
def update(*args):
    """Advance the animation one frame.

    Shifts every trace's samples one column to the right, draws fresh
    random values into column 0, pushes the new y-data into each Line2D,
    and returns the modified artists (as FuncAnimation expects).
    """
    # Shift all data to the right by one sample
    data[:, 1:] = data[:, :-1]
    # Fill-in new random values at the left edge
    data[:, 0] = np.random.uniform(0, 1, len(data))
    # Update each line, keeping its vertical offset and Gaussian envelope
    for row, trace in enumerate(lines):
        trace.set_ydata(row + G * data[row])
    # Return modified artists
    return lines
# Construct the animation, using the update function as the animation director.
# interval=50 ms between frames (~20 fps); no frame count is given, so it
# keeps running while displayed
anim = animation.FuncAnimation(fig, update, interval=50)
#HTML(anim.to_html5_video())
# Evaluating the animation object as the cell's last expression displays it
anim
# rm(list=ls()) clears the workspace; data(<dataset>) loads a bundled dataset.
# Useful first looks at a dataset: head(), dim().
# RStudio shortcuts: CMD-RETURN / CMD-SHIFT-RETURN run code; CTRL-C interrupts.
# Use ? for built-in help pages (versus "google searches").
# rm(list=ls())
# List the datasets bundled with R, then load the USArrests example data
data()
data("USArrests")
# Peek at the first rows: 50 states (row names) x 4 numeric columns
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
dim(USArrests)
## [1] 50 4
#?USArrests
# summary(USArrests)
# RStudio: CMD-ENTER runs the current line/selection.
# $ extracts a column; merge() joins two data frames (very useful!).
# <- assigns to a variable (-> also works, but is discouraged).
plot(USArrests)
# Pairwise scatterplots with a smoothed 2D density rendered in each panel
pairs(USArrests, panel=function(x,y){smoothScatter(x,y,add=T)})
# Side-by-side boxplots of all four variables (note: variables are on
# different scales, so compare shapes with care)
boxplot(USArrests)
# Overlay kernel density estimates of each variable on a shared x-range
plot(density(USArrests$Murder), xlim=c(min(USArrests),max(USArrests)),
main="KDE")
polygon(density(USArrests$Murder), lty=1, col='black')
polygon(density(USArrests$Assault), lty=2, col='red')
polygon(density(USArrests$UrbanPop), lty=3, col='blue')
polygon(density(USArrests$Rape), lty=4, col='green')
legend('topright', legend=c("Murder","Assult","UrbanPop","Rape"),
lty=1:4, col=c('black','red','blue','green'))
# Bundled "state" data: build a one-column region lookup keyed by state name
data("state")
state <- data.frame(region=state.region)
rownames(state) <- state.name
# Join arrest data with region labels on row names; merge() moves the row
# names into a new "Row.names" column and sorts by it
USArrests.V2 <- merge(USArrests, state, by="row.names")
library('lattice')
# Histogram of urbanization, one panel per region, in a 2x2 layout
histogram(~UrbanPop|region, data=USArrests.V2, type="count", layout=c(2,2))
# region is a factor data type; levels() lists its category labels
NE <- levels(USArrests.V2$region)[1]
S <- levels(USArrests.V2$region)[2]
# Disclaimer: we're just demonstrating available functionality with all these
# tests here -- more advanced relevant considerations are being ignored for now
# Welch two-sample t-test: Assault rates, Northeast vs South states
# NOTE(review): this indexes USArrests by USArrests.V2's region column, which
# assumes both data frames share the same (alphabetical) row order after
# merge() -- verify, or subset USArrests.V2$Assault directly
t.test(USArrests$Assault[USArrests.V2$region==NE],
USArrests$Assault[USArrests.V2$region==S])
##
## Welch Two Sample t-test
##
## data: USArrests$Assault[USArrests.V2$region == NE] and USArrests$Assault[USArrests.V2$region == S]
## t = -3.2762, df = 18.709, p-value = 0.004034
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -153.02208 -33.64458
## sample estimates:
## mean of x mean of y
## 126.6667 220.0000
# Nonparametric analogue of the t-test above: Wilcoxon rank-sum test of
# Assault, Northeast vs South (same row-order caveat as the t-test)
wilcox.test(USArrests$Assault[USArrests.V2$region==NE],
USArrests$Assault[USArrests.V2$region==S])
##
## Wilcoxon rank sum exact test
##
## data: USArrests$Assault[USArrests.V2$region == NE] and USArrests$Assault[USArrests.V2$region == S]
## W = 25, p-value = 0.006547
## alternative hypothesis: true location shift is not equal to 0
# Assault distributions, one histogram panel per region
histogram(~Assault|region, data=USArrests.V2, type="count", layout=c(2,2))
# another version to see if SouthWest differs from the rest
# won't discuss this one in class for time
# Logical vector: is each state's assault rate above the national median?
AssultsAboveMed <- USArrests$Assault>median(USArrests$Assault)
# Logical vector: is the state in region level 2 or 4 (labelled "SouthWest")?
SouthWest <- USArrests.V2$region %in% levels(USArrests.V2$region)[c(2,4)]
# 2x2 contingency table of the two indicators
table(SouthWest,AssultsAboveMed)
## AssultsAboveMed
## SouthWest FALSE TRUE
## FALSE 16 5
## TRUE 10 19
# Exact test of association for the 2x2 table
fisher.test(SouthWest,AssultsAboveMed)
##
## Fisher's Exact Test for Count Data
##
## data: SouthWest and AssultsAboveMed
## p-value = 0.004697
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.491363 26.881754
## sample estimates:
## odds ratio
## 5.840385
# Linear model: Rape rate predicted by region (a factor, so it expands into
# dummy variables against the first level) plus three numeric covariates
lm.fit <- lm(Rape~region+UrbanPop+Assault+Murder, data=USArrests.V2)
summary(lm.fit)
##
## Call:
## lm(formula = Rape ~ region + UrbanPop + Assault + Murder, data = USArrests.V2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1752 -1.8764 -0.1118 3.2118 12.8219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.51369 4.01709 -0.875 0.38660
## regionSouth -1.56471 2.78906 -0.561 0.57770
## regionNorth Central 4.48974 2.23786 2.006 0.05114 .
## regionWest 11.01689 2.23921 4.920 1.31e-05 ***
## UrbanPop 0.12176 0.05721 2.128 0.03908 *
## Assault 0.02793 0.01585 1.762 0.08516 .
## Murder 1.09850 0.32889 3.340 0.00174 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.932 on 43 degrees of freedom
## Multiple R-squared: 0.7566, Adjusted R-squared: 0.7227
## F-statistic: 22.28 on 6 and 43 DF, p-value: 9.93e-12
# Standard lm diagnostic plots arranged in a 2x2 grid
par(mfrow=c(2,2))
plot(lm.fit)
# Generalized variance inflation factors (car package) to check collinearity
car::vif(lm.fit)
## GVIF Df GVIF^(1/(2*Df))
## region 2.437918 3 1.160121
## UrbanPop 1.381098 1 1.175201
## Assault 3.514233 1 1.874629
## Murder 4.132841 1 2.032939
# Poisson GLM: yearly discovery counts modelled on lake level and sunspot
# activity; ts.intersect() aligns the three time series on overlapping years
glm.fit <- glm(discoveries~LakeHuron+sunspot.year, family=poisson,
data=ts.intersect(LakeHuron, discoveries, sunspot.year))
summary(glm.fit)
##
## Call:
## glm(formula = discoveries ~ LakeHuron + sunspot.year, family = poisson,
## data = ts.intersect(LakeHuron, discoveries, sunspot.year))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7390 -0.9204 -0.1821 0.7727 2.9947
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -82.373090 28.009959 -2.941 0.00327 **
## LakeHuron 0.144482 0.048321 2.990 0.00279 **
## sunspot.year -0.003041 0.001609 -1.890 0.05878 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 140.98 on 84 degrees of freedom
## Residual deviance: 124.93 on 82 degrees of freedom
## AIC: 364.37
##
## Number of Fisher Scoring iterations: 5
# tidy is the "new" style (for how you should do many [but not all!] things).
# library() loads packages; note rm(list=ls()) doesn't remove loaded packages.
# Key tidyverse ideas: tibbles (an improvement on data.frames); the %>% pipe
# (pass output to the next function! Very cool -- the "notation/syntax" is a
# bit weird in R, but super useful anyway); dplyr::mutate (AWESOME); and the
# tidyr::pivot_longer style, which is quite powerful (tricky to figure out,
# though). The dplyr:: / tidyr:: prefixes aren't required once the package is
# loaded, but I'm being explicit and doing so.
library(tidyverse)
# Overwrite the merged data frame with a tibble version. The original used
# the right-assign `->` operator; left-assignment with `<-` is the
# tidyverse-style-guide-recommended form and reads more predictably.
USArrests.V2 <- USArrests.V2 %>% as_tibble()
# Reshape to long ("tall") format: one row per state x variable, after adding
# a sqrt-transformed Assault column so the three outcomes share a scale.
USArrests.V2.tall <- USArrests.V2 %>%
  dplyr::mutate(sqrtAssault=sqrt(Assault)) %>%
  tidyr::pivot_longer(cols=c(Murder, sqrtAssault, Rape))
USArrests.V2.tall
## # A tibble: 150 x 6
## Row.names Assault UrbanPop region name value
## <I<chr>> <int> <int> <fct> <chr> <dbl>
## 1 Alabama 236 58 South Murder 13.2
## 2 Alabama 236 58 South sqrtAssault 15.4
## 3 Alabama 236 58 South Rape 21.2
## 4 Alaska 263 48 West Murder 10
## 5 Alaska 263 48 West sqrtAssault 16.2
## 6 Alaska 263 48 West Rape 44.5
## 7 Arizona 294 80 West Murder 8.1
## 8 Arizona 294 80 West sqrtAssault 17.1
## 9 Arizona 294 80 West Rape 31
## 10 Arkansas 190 50 South Murder 8.8
## # … with 140 more rows
library(ggplot2)
# Each (long-format) variable against urbanization, one panel per region,
# with a loess trend line per variable (no confidence band).
ggplot(USArrests.V2.tall, aes(x=UrbanPop, y=value, color=name)) +
  geom_point() +
  facet_wrap(~region) +
  stat_smooth(method='loess', formula='y~x', se=FALSE, alpha=0.1)
library(ggbeeswarm)
# Boxplots of each variable with the raw points overlaid as a beeswarm
# (points jittered deterministically so none overlap)
USArrests.V2.tall %>%
ggplot2::ggplot(aes(x=name, y=value, color=name)) + geom_boxplot() +
geom_beeswarm(size = 3, cex = 3)
# Same comparison as violin plots with a centered dot-plot overlay
USArrests.V2.tall %>%
ggplot2::ggplot(aes(x=name, y=value, color=name)) + geom_violin() +
geom_dotplot(binaxis='y', binwidth=3, stackdir='center', dotsize=.3, fill=NA)
library(GGally)
# Default pairs matrix of the four numeric columns (columns 2-5 of the tibble;
# column 1 is the Row.names state label)
ggpairs(USArrests.V2 %>% dplyr::select(2:5))
# https://stackoverflow.com/questions/7714677/scatterplot-with-too-many-points
# Panel function for GGally::ggpairs(): scatterplot with 2D density contour
# lines drawn on top. ggpairs supplies `data` and `mapping` per panel.
pairwise_density <- function(data, mapping, ...){
  panel <- ggplot(data=data, mapping=mapping)
  panel + geom_point() + geom_density_2d()
}
# Pairs matrix using the custom density-contour panel for the lower triangle
ggpairs(USArrests.V2 %>% dplyr::select(2:5),
lower=list(continuous=pairwise_density))
# Panel function for GGally::ggpairs(): filled 2D density heatmap (viridis
# fill, raster geometry) with the raw points drawn as small white dots on top.
pairwise_density <- function(data, mapping, ...){
  ggplot(data=data, mapping=mapping) +
    # after_stat(density) replaces the soft-deprecated stat(density)
    # notation (deprecated since ggplot2 3.4.0; same computed variable)
    stat_density_2d(aes(fill = after_stat(density)), geom='raster', contour=FALSE) +
    scale_fill_viridis_c() + coord_cartesian(expand=FALSE) +
    geom_point(shape='.', col='white', size=5)
}
# Pairs matrix using the density-heatmap panel for the lower triangle
ggpairs(USArrests.V2 %>% dplyr::select(2:5),
lower=list(continuous=pairwise_density))
# Panel function for GGally::ggpairs(): hexagonal binning (7 bins) with the
# raw points overlaid as small white dots.
pairwise_density <- function(data, mapping, ...){
  hex_layer <- stat_bin_hex(bins=7)
  dot_layer <- geom_point(shape='.', col='white', size=5)
  ggplot(data=data, mapping=mapping) + hex_layer + dot_layer
}
# Pairs matrix using the hex-binning panel for the lower triangle
ggpairs(USArrests.V2 %>% dplyr::select(2:5),
lower=list(continuous=pairwise_density))
library(plotly)
# Interactive 3D scatter: the three crime rates on the axes, urbanization as
# the marker color. hovertext supplies "State (region)" labels per point;
# hovertemplate formats the popup, and the empty <extra></extra> tag
# suppresses plotly's default trace-name box.
plot_ly(USArrests.V2, x=~Assault, y=~Murder, z=~Rape, color=~UrbanPop,
type='scatter3d', mode='markers',
hovertext=~paste0(Row.names,' (',region,')'),
hovertemplate = paste('%{hovertext}<br>',
'Assault: %{x}<br>',
'Murder: %{y}<br>',
'Rape: %{z}<br><extra></extra>'))
# do tests for all region pairs and 3 variables for each
n_tests <- choose(4,2)*3
# factorial(4)/(factorial(2)*factorial(2))
# Split the tall data into one tibble per outcome variable (named by the
# group keys), run unadjusted pairwise t-tests of value across regions within
# each, tidy each result and tag it with its outcome name, stack the tables,
# and finally apply a Bonferroni correction across all n_tests comparisons.
USArrests.V2.tall %>%
dplyr::group_split(name, .keep=FALSE) %>%
purrr::set_names(USArrests.V2.tall %>%
dplyr::group_keys(name) %>%
as.matrix() %>% as.list()) %>%
purrr::map(~ pairwise.t.test(.x$value, .x$region, p.adj='none')) %>%
purrr::imap(~ broom::tidy(.x) %>% add_column(outcome=.y)) %>%
dplyr::bind_rows() %>%
dplyr::mutate(p.value_bonf.adj = p.adjust(p.value, 'bonferroni', n=n_tests))
## # A tibble: 18 x 5
## group1 group2 p.value outcome p.value_bonf.adj
## <chr> <chr> <dbl> <chr> <dbl>
## 1 South Northeast 0.0000117 Murder 0.000210
## 2 North Central Northeast 0.511 Murder 1
## 3 North Central South 0.0000334 Murder 0.000602
## 4 West Northeast 0.123 Murder 1
## 5 West South 0.000648 Murder 0.0117
## 6 West North Central 0.336 Murder 1
## 7 South Northeast 0.0308 Rape 0.554
## 8 North Central Northeast 0.190 Rape 1
## 9 North Central South 0.375 Rape 1
## 10 West Northeast 0.0000579 Rape 0.00104
## 11 West South 0.0108 Rape 0.194
## 12 West North Central 0.00170 Rape 0.0306
## 13 South Northeast 0.00413 sqrtAssault 0.0743
## 14 North Central Northeast 0.780 sqrtAssault 1
## 15 North Central South 0.000734 sqrtAssault 0.0132
## 16 West Northeast 0.0630 sqrtAssault 1
## 17 West South 0.254 sqrtAssault 1
## 18 West North Central 0.0219 sqrtAssault 0.393
data('iris')
library('recipes')
# Preprocessing recipe: model Species from two width measurements, centering
# and scaling both predictors; prep() estimates the means/sds from `iris`
iris_recipe <- recipe(Species~Sepal.Width+Petal.Width, data=iris) %>%
step_center(-Species) %>%
step_scale(-Species) %>%
prep()
#iris_baked <- bake(iris_recipe, iris)
library('parsnip')
# Random forest spec (ranger engine); tune() marks mtry and trees as
# hyperparameters to be filled in later by the tuning step
random_forest_classifier <- rand_forest(mtry=tune(), trees=tune()) %>%
set_engine("ranger") %>%
set_mode("classification")
#rf_fit <- random_forest_classifier %>%
# fit(Species~., data = iris_baked)
# RBF-kernel SVM spec (kernlab engine), with cost and rbf_sigma to be tuned
support_vector_machine_classifier <- svm_rbf(cost=tune(), rbf_sigma=tune()) %>%
set_engine("kernlab") %>%
set_mode("classification")
#svm_fit <- support_vector_machine_classifier %>%
# fit(Species~., data = iris_baked)
library('rsample')
library('tune')
library('workflows')
# RF
# Bundle the preprocessing recipe and the RF model spec into one workflow
iris_rf_workflow <- workflow() %>%
add_recipe(iris_recipe) %>%
add_model(random_forest_classifier)
# Tune mtry/trees over a 7-point grid with 5-fold cross-validation
rf_search <- iris_rf_workflow %>%
tune_grid(resamples=vfold_cv(iris, v=5), grid=7)
#show_best(search, metric="roc_auc")
# Pick the hyperparameters with the best cross-validated ROC AUC
best_rf_params <- rf_search %>%
select_best(metric = "roc_auc")
# Lock in those parameters and refit on the full iris data
rf_finalized <- iris_rf_workflow %>%
finalize_workflow(best_rf_params) %>%
fit(data=iris)
# SVM
# Same pattern as the RF above: bundle recipe + SVM spec into a workflow
iris_svm_workflow <- workflow() %>%
add_recipe(iris_recipe) %>%
add_model(support_vector_machine_classifier)
# Tune cost/rbf_sigma over a 7-point grid with 5-fold cross-validation
svm_search <- iris_svm_workflow %>%
tune_grid(resamples=vfold_cv(iris, v=5), grid=7)
#show_best(search, metric="roc_auc")
# Best hyperparameters by cross-validated ROC AUC
best_svm_params <- svm_search %>%
select_best(metric = "roc_auc")
# Finalize and refit on the full iris data
svm_finalized <- iris_svm_workflow %>%
finalize_workflow(best_svm_params) %>%
fit(data=iris)
# grid
# 30x30 grid spanning the observed ranges of the two predictors
x <- modelr::seq_range(iris$Sepal.Width, n=30)
y <- modelr::seq_range(iris$Petal.Width, n=30)
xy <- tidyr::expand_grid(x,y) %>%
purrr::set_names(list('Sepal.Width', 'Petal.Width'))
# Class probabilities for every grid point; join the coordinates back on via
# a shared rowid column, then record each point's winning (max) probability
# NOTE(review): left_join() with no `by=` relies on matching the "rowid"
# column added to both sides -- intended here, but fragile if columns change
rf_decision_boundary <- predict(rf_finalized, new_data=xy, type='prob') %>%
tibble::rowid_to_column() %>%
left_join(tibble::rowid_to_column(xy)) %>%
rowwise() %>%
mutate(max = max(.pred_setosa, .pred_versicolor, .pred_virginica))
# Sketch the RF decision regions: for each class, contour the grid points it
# wins (filter keeps rows where that class's probability equals the max),
# then overlay the actual observations colored by species
ggplot() +
geom_density_2d(rf_decision_boundary%>%filter(.pred_setosa==max),
mapping=aes(x=Sepal.Width, y=Petal.Width, color='setosa'),
alpha=0.3)+
geom_density_2d(rf_decision_boundary%>%filter(.pred_versicolor==max),
mapping=aes(x=Sepal.Width, y=Petal.Width, color='versicolor'),
alpha=0.3) +
geom_density_2d(rf_decision_boundary%>%filter(.pred_virginica==max),
mapping=aes(x=Sepal.Width, y=Petal.Width, color='virginica'),
alpha=0.3) +
geom_point(tibble(iris),
mapping=aes(x=Sepal.Width, y=Petal.Width, color=Species), size=2) +
ggtitle('Random Forest Classifier')
# Same grid-prediction construction as for the RF: SVM class probabilities at
# every grid point, joined to the coordinates by rowid, with each point's
# winning (max) probability recorded
svm_decision_boundary <- predict(svm_finalized, new_data=xy, type='prob') %>%
tibble::rowid_to_column() %>%
left_join(tibble::rowid_to_column(xy)) %>%
rowwise() %>%
mutate(max = max(.pred_setosa, .pred_versicolor, .pred_virginica))
# SVM decision regions: filled density contours of the grid points each class
# wins, with the observations overlaid as black-edged diamonds (pch=23).
# The deprecated `..level..` dot-dot notation is replaced by after_stat(level)
# (the supported form since ggplot2 3.4.0; same computed variable).
ggplot() +
  geom_density_2d_filled(svm_decision_boundary%>%filter(.pred_setosa==max),
                         mapping=aes(x=Sepal.Width, y=Petal.Width, color='setosa',
                                     fill='setosa', alpha=after_stat(level))) +
  geom_density_2d_filled(svm_decision_boundary%>%filter(.pred_versicolor==max),
                         mapping=aes(x=Sepal.Width, y=Petal.Width, color='versicolor',
                                     fill='versicolor', alpha=after_stat(level))) +
  geom_density_2d_filled(svm_decision_boundary%>%filter(.pred_virginica==max),
                         mapping=aes(x=Sepal.Width, y=Petal.Width, color='virginica',
                                     fill='virginica', alpha=after_stat(level))) +
  geom_point(tibble(iris),
             mapping=aes(x=Sepal.Width, y=Petal.Width, fill=Species),
             size=4, pch=23, color='black') +
  ggtitle('Support Vector Machine Classifier')